# Load API key and secret from environment variables
from dotenv import load_dotenv
load_dotenv()
# ML libraries
import pandas as pd
import xgboost as xgb
from numpy import argmax
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
# ValidMind libraries
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
import validmind as vm
from validmind.vm_models.test_context import TestContext
# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
CRE/MF Mortgage Rate Model On-ValidMind
Setup
Load Data
# Read the tab-separated loan-rate file. The date column comes in under the
# name 'Unnamed: 0', so give it a proper name first.
raw_rates = pd.read_csv("../datasets/lending_club_loan_rates.csv", sep='\t')
raw_rates = raw_rates.rename(columns={'Unnamed: 0': 'Date'})

# Index the frame by the parsed dates, then discard the original text column
# so that only the rate series remain as columns.
date_index = pd.to_datetime(raw_rates['Date'])
df = raw_rates.set_index(date_index).drop(columns=["Date"])
df.head()
| Date | loan_rate_A | loan_rate_B | loan_rate_C | loan_rate_D | FEDFUNDS | diff1_loan_rate_A | diff1_loan_rate_B | diff1_loan_rate_C | diff1_loan_rate_D | diff1_FEDFUNDS | diff2_FEDFUNDS |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2007-08-01 | 7.766667 | 9.497692 | 10.947500 | 12.267000 | 5.02 | 0.060000 | 0.134359 | 0.207500 | -0.467444 | -0.24 | -0.25 |
| 2007-09-01 | 7.841429 | 9.276667 | 10.829167 | 12.436667 | 4.94 | 0.074762 | -0.221026 | -0.118333 | 0.169667 | -0.08 | 0.16 |
| 2007-10-01 | 7.830000 | 9.433333 | 10.825926 | 12.737368 | 4.76 | -0.011429 | 0.156667 | -0.003241 | 0.300702 | -0.18 | -0.10 |
| 2007-11-01 | 7.779091 | 9.467778 | 10.967037 | 12.609444 | 4.49 | -0.050909 | 0.034444 | 0.141111 | -0.127924 | -0.27 | -0.09 |
| 2007-12-01 | 7.695833 | 9.387500 | 10.805000 | 12.478889 | 4.24 | -0.083258 | -0.080278 | -0.162037 | -0.130556 | -0.25 | 0.02 |
Visual Inspection.
ValidMind Setup
Initialize ValidMind dataset.
import os

# Connect to the ValidMind tracking API.
#
# Credentials are read from the environment (populated by load_dotenv() at the
# top of the file) instead of being hard-coded, so secrets never end up in the
# notebook or in version control. Expected variables:
#   VM_API_KEY, VM_API_SECRET  - required; a missing one raises KeyError here
#                                rather than sending an unauthenticated request
#   VM_API_HOST, VM_API_PROJECT - optional; default to the local development
#                                 host and project used previously
# NOTE: the key/secret pair that used to be written inline here has been
# exposed and should be rotated.
vm.init(
    api_host=os.environ.get("VM_API_HOST", "http://localhost:3000/api/v1/tracking"),
    api_key=os.environ["VM_API_KEY"],
    api_secret=os.environ["VM_API_SECRET"],
    project=os.environ.get("VM_API_PROJECT", "clgo0g0rt0000fjy6ozl9pb69"),
)
True
target_variables = ["loan_rate_A", "loan_rate_B", "loan_rate_C", "loan_rate_D"]
vm_dataset = vm.init_dataset(
dataset=df,
target_column = target_variables
)Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Create Train and Test datasets.
test_size = 30
train_ds = df[:-test_size]
test_ds = df[-test_size:]
vm_train_ds = vm.init_dataset(dataset=train_ds, type="generic", target_column=target_variables)
vm_test_ds = vm.init_dataset(dataset=test_ds, type="generic", target_column=target_variables)Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Visualize existing test plans.
vm.test_plans.list_plans()
| ID | Name | Description |
|---|---|---|
| sklearn_classifier_metrics | SKLearnClassifierMetrics | Test plan for sklearn classifier metrics |
| sklearn_classifier_validation | SKLearnClassifierPerformance | Test plan for sklearn classifier models |
| sklearn_classifier | SKLearnClassifier | Test plan for sklearn classifier models that includes both metrics and validation tests |
| tabular_dataset | TabularDataset | Test plan for generic tabular datasets |
| tabular_dataset_description | TabularDatasetDescription | Test plan to extract metadata and descriptive statistics from a tabular dataset |
| tabular_data_quality | TabularDataQuality | Test plan for data quality on tabular datasets |
| normality_test_plan | NormalityTestPlan | Test plan to perform normality tests. |
| autocorrelation_test_plan | AutocorrelationTestPlan | Test plan to perform autocorrelation tests. |
| seasonality_test_plan | SesonalityTestPlan | Test plan to perform seasonality tests. |
| unit_root_test_plan | UnitRootTestPlan | Test plan to perform unit root tests. |
| stationarity_test_plan | StationarityTestPlan | Test plan to perform stationarity tests. |
| timeseries_test_plan | TimeSeriesTestPlan | Test plan for time series statsmodels that includes both metrics and validation tests |
| timeseries_univariate_inspection | TimeSeriesUnivariateInspection | Test plan to perform univariate inspection tests. |
4. Model Development
4.1. Development Data and Platform
4.1.2. Data Quality and Relevance
4.1.3. Data Process, Adjustments and Treatment
A. Missing Values Analysis
Step 1: Calculate the percentage of missing values in each column
Step 2: Display the missing values percentage in a table format
Step 3: Visualize the missing values
## B. Outliers Analysis
Step 1: Visualize the dataset using box plots
Visualize the data using box plots to get an initial sense of the presence of outliers.
Step 2: Calculate Z-scores
Step 3: Set a threshold and identify outliers
Set a threshold (e.g., 3) to identify data points with Z-scores higher than the threshold.
Step 4: Analyze the outliers
Analyze the outliers by looking at their frequency, index, and corresponding column.
C. Seasonality Analysis
Step 1: Seasonal decomposition
Perform seasonal decomposition on each time series.
# Seasonal decomposition of the training dataset.
# The metric is built from a TestContext that carries only the training
# partition (vm_train_ds); the test partition is not passed in.
from validmind.model_validation.statsmodels.metrics import SeasonalDecompose
test_context = TestContext(train_ds=vm_train_ds)
sd_metric = SeasonalDecompose(test_context=test_context)Step 2: Visualize seasonal decomposition
Create plots for observed, trend, seasonal and residual components.
# run() computes the metric (the captured output below shows it being logged
# to the ValidMind platform); result.show() then displays the stored result.
sd_metric.run()
sd_metric.result.show()Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': [{'Date': '2007-08-01', 'loan_rate_A': 7.7666666666666675, 'trend': nan, 'seasonal': -0.050284773390468364, 'resid': nan}, {'Date': '2007-09-01', 'loan_rate_A': 7.841428571428572, 'trend': nan, 'seasonal': -0.06087962072801926, 'resid': nan}, {'Date': '2007-10-01', 'loan_rate_A': 7.83, 'trend': nan, 'seasonal': 0.01749661199350169, 'resid': nan}, {'Date': '2007-11-01', 'loan_rate_A': 7.779090909090908, 'trend': nan, 'seasonal': -0.047258378330469565, 'resid': nan}, {'Date': '2007-12-01', 'loan_rate_A': 7.695833333333333, 'trend': nan, 'seasonal': 0.08505178146324885, 'resid': nan}, {'Date': '2008-01-01', 'loan_rate_A': 7.961333333333333, 'trend': nan, 'seasonal': 0.06564185816692848, 'resid': nan}, {'Date': '2008-02-01', 'loan_rate_A': 8.130333333333333, 'trend': 8.005048767959094, 'seasonal': 0.008943337297934253, 'resid': 0.11634122807630445}, {'Date': '2008-03-01', 'loan_rate_A': 8.126285714285714, 'trend': 8.036669799705125, 'seasonal': -0.002099404440702811, 'resid': 0.09171531902129207},...
Seasonality Detection using ACF and PACF.
# Seasonality detection via ACF/PACF on the training dataset.
# A fresh TestContext is created for this metric; it again carries only
# vm_train_ds.
from validmind.model_validation.statsmodels.metrics import SeasonalityDetectionWithACF
test_context = TestContext(train_ds=vm_train_ds)
acf_metric = SeasonalityDetectionWithACF(test_context=test_context)
# Compute the metric, then display its result.
acf_metric.run()
acf_metric.result.show()Step 3: Residuals Analysis
Residuals series, histogram, Q-Q and ACF plots.
# Comment: How do I pass the residuals of seasonal decomponsition done before using SeasonalDecomposeMetricWithFigure?
from validmind.model_validation.statsmodels.metrics import ResidualsVisualInspection
test_context = TestContext(train_ds=vm_train_ds)
rvi_metric = ResidualsVisualInspection(test_context=test_context)
rvi_metric.run()rvi_metric.result.show()Test if Residuals are Normaly Distributed.
# Run the normality test plan on the train and test datasets.
# TODO: open question - how can the residuals of the earlier seasonal
# decomposition be passed to this test plan instead of the datasets as-is?
vm.run_test_plan("normality_test_plan", train_ds=vm_train_ds, test_ds=vm_test_ds)Test if Residuals are Autocorrelated.
# Run the autocorrelation test plan on the train and test datasets.
# TODO: same open question as above - how to pass the residuals of the earlier
# seasonal decomposition to this test plan.
vm.run_test_plan("autocorrelation_test_plan", train_ds=vm_train_ds, test_ds=vm_test_ds)Step 4: Test for seasonality using the Augmented Dickey-Fuller (ADF) test
Step 5: Analyze the seasonality test results
Step 6: Interpret the results
Step 7: Handle seasonality
4.2. Methodology Selection and Development
4.2.4 Variable Analysis
## A. Feature Analysis
A.1. Univariate Analysis
Visual Inspection
A.2. Multivariate Analysis
Visual Inspection
B. Variable Selection
ARIMA Analysis
Step 1: Identify the Integration order (Stationarity Analysis)
Unit Root Tests.
# Run the unit-root test plan on the train and test datasets (used for
# Step 1, identifying the integration order).
vm.run_test_plan("unit_root_test_plan", train_ds=vm_train_ds, test_ds=vm_test_ds)Step 2: Identify the AR order
Step 3: Identify the MA order
# NOTE(review): this cell sits under the "Identify the MA order" heading but
# re-runs the normality test plan (the same call used in the residuals
# analysis section) - confirm this is the intended test plan.
vm.run_test_plan("normality_test_plan", train_ds=vm_train_ds, test_ds=vm_test_ds)Run SeasonalDecomposeMetricWithFigure Test
test_context = TestContext(train_ds=vm_train_ds)
sd_metric = SeasonalDecomposeMetricWithFigure(test_context=test_context)
sd_metric.run()Run ResidualsVisualInspection Test
test_context = TestContext(train_ds=vm_train_ds, test_ds=vm_test_ds)
rvi_test = ResidualsVisualInspection(test_context=test_context)
rvi_test.run()vm.run_test_plan("seasonality_test_plan", train_ds=vm_train_ds, test_ds=vm_test_ds)Running Metric: seasonality_detection_with_acf: 50%|█████ | 1/2 [00:02<00:02, 2.68s/it]The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
Results for autocorrelation_test_plan Test Plan:
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 99.79377801179028, 'pvalue': 1.691207273732272e-23}, 'loan_rate_B': {'stat': 98.47643762617452, 'pvalue': 3.289150569642567e-23}, 'loan_rate_C': {'stat': 102.72120544262356, 'pvalue': 3.8579272650362314e-24}, 'loan_rate_D': {'stat': 102.06119689975492, 'pvalue': 5.383276667018511e-24}, 'FEDFUNDS': {'stat': 92.49617760172814, 'pvalue': 6.745493792730931e-22}, 'diff1_loan_rate_A': {'stat': 0.4402266706066844, 'pvalue': 0.5070130714060204}, 'diff1_loan_rate_B': {'stat': 9.03053378524947, 'pvalue': 0.0026550694107563377}, 'diff1_loan_rate_C': {'stat': 8.105512733213732, 'pvalue': 0.004413083679995846}, 'diff1_loan_rate_D': {'stat': 2.8385839077615724, 'pvalue': 0.09202528239248291}, 'diff1_FEDFUNDS': {'stat': 47.24114990634128, 'pvalue': 6.276875252119377e-12}, 'diff2_FEDFUNDS': {'stat': 2.412848114121581, 'pvalue': 0.12034323868942863}}
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 99.79377801179028, 'pvalue': 1.691207273732272e-23}, 'loan_rate_B': {'stat': 98.47643762617452, 'pvalue': 3.289150569642567e-23}, 'loan_rate_C': {'stat': 102.72120544262356, 'pvalue': 3.8579272650362314e-24}, 'loan_rate_D': {'stat': 102.06119689975492, 'pvalue': 5.383276667018511e-24}, 'FEDFUNDS': {'stat': 92.49617760172814, 'pvalue': 6.745493792730931e-22}, 'diff1_loan_rate_A': {'stat': 0.4402266706066844, 'pvalue': 0.5070130714060204}, 'diff1_loan_rate_B': {'stat': 9.03053378524947, 'pvalue': 0.0026550694107563377}, 'diff1_loan_rate_C': {'stat': 8.105512733213732, 'pvalue': 0.004413083679995846}, 'diff1_loan_rate_D': {'stat': 2.8385839077615724, 'pvalue': 0.09202528239248291}, 'diff1_FEDFUNDS': {'stat': 47.24114990634128, 'pvalue': 6.276875252119377e-12}, 'diff2_FEDFUNDS': {'stat': 2.412848114121581, 'pvalue': 0.12034323868942863}}
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': -8.632956969312945, 'pvalue': 5.978593166876075e-18}, 'loan_rate_B': {'stat': -9.225267842093881, 'pvalue': 2.8285309180225434e-20}, 'loan_rate_C': {'stat': -9.408682037015677, 'pvalue': 5.0239675387170514e-21}, 'loan_rate_D': {'stat': -10.196439746918763, 'pvalue': 2.056741863262086e-24}, 'FEDFUNDS': {'stat': -10.094086373225906, 'pvalue': 5.8674887432668504e-24}, 'diff1_loan_rate_A': {'stat': 0.09804933849375877, 'pvalue': 0.9218931156254718}, 'diff1_loan_rate_B': {'stat': -0.6586877630844152, 'pvalue': 0.5100962925233736}, 'diff1_loan_rate_C': {'stat': -2.8948770301798645, 'pvalue': 0.0037930709179304486}, 'diff1_loan_rate_D': {'stat': -0.5392715515560418, 'pvalue': 0.589699495478653}, 'diff1_FEDFUNDS': {'stat': -8.465639721231678, 'pvalue': 2.5475051177682628e-17}, 'diff2_FEDFUNDS': {'stat': 1.8723472602592905, 'pvalue': 0.061158576378083265}}
Results for normality_test_plan Test Plan:
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 2.848172850453067, 'pvalue': 0.24072828607015606, 'skew': 0.3894929136110148, 'kurtosis': 2.821048639289756}, 'loan_rate_B': {'stat': 7.948577904082513, 'pvalue': 0.018792659227110216, 'skew': -0.1678688191978654, 'kurtosis': 1.7076614865010464}, 'loan_rate_C': {'stat': 1.4151119572602828, 'pvalue': 0.4928472559373931, 'skew': -0.2667218461199135, 'kurtosis': 2.818765023550964}, 'loan_rate_D': {'stat': 5.805990427261884, 'pvalue': 0.05485866032647532, 'skew': -0.3686251358879733, 'kurtosis': 2.1289430194058165}, 'FEDFUNDS': {'stat': 351.77897186101717, 'pvalue': 4.094179086914819e-77, 'skew': 2.807862376508006, 'kurtosis': 9.882392761343626}, 'diff1_loan_rate_A': {'stat': 49.69819494996339, 'pvalue': 1.615005799924691e-11, 'skew': -0.292775263452962, 'kurtosis': 6.287003081958673}, 'diff1_loan_rate_B': {'stat': 84.6129819949766, 'pvalue': 4.2317929537986954e-19, 'skew': -0.9659733072874014, 'kurtosis': 6.904637635216992}, 'diff1_loan_rate_C': {'stat': 13.59037152041562, 'pvalue': 0.00...
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 0.08132576894776711, 'pvalue': 0.08981725287412488}, 'loan_rate_B': {'stat': 0.12791454984124828, 'pvalue': 0.0009999999999998899}, 'loan_rate_C': {'stat': 0.13865442725902158, 'pvalue': 0.0009999999999998899}, 'loan_rate_D': {'stat': 0.12708648400842287, 'pvalue': 0.0009999999999998899}, 'FEDFUNDS': {'stat': 0.4185270469008754, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_A': {'stat': 0.17590867451812156, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_B': {'stat': 0.18838563953847634, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_C': {'stat': 0.1810182591944418, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_D': {'stat': 0.16004784597113098, 'pvalue': 0.0009999999999998899}, 'diff1_FEDFUNDS': {'stat': 0.3742488536123778, 'pvalue': 0.0009999999999998899}, 'diff2_FEDFUNDS': {'stat': 0.28686393808100397, 'pvalue': 0.0009999999999998899}}
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 0.975836992263794, 'pvalue': 0.04807629808783531}, 'loan_rate_B': {'stat': 0.9356268644332886, 'pvalue': 6.0137466789456084e-05}, 'loan_rate_C': {'stat': 0.9475300312042236, 'pvalue': 0.0003490431990940124}, 'loan_rate_D': {'stat': 0.9415098428726196, 'pvalue': 0.00014052187907509506}, 'FEDFUNDS': {'stat': 0.4598355293273926, 'pvalue': 4.666416175806635e-18}, 'diff1_loan_rate_A': {'stat': 0.8952228426933289, 'pvalue': 4.106910580503609e-07}, 'diff1_loan_rate_B': {'stat': 0.8786153793334961, 'pvalue': 7.289008863153867e-08}, 'diff1_loan_rate_C': {'stat': 0.9339078068733215, 'pvalue': 4.726010956801474e-05}, 'diff1_loan_rate_D': {'stat': 0.8760568499565125, 'pvalue': 5.656733037540107e-08}, 'diff1_FEDFUNDS': {'stat': 0.4879304766654968, 'pvalue': 1.3033836108341916e-17}, 'diff2_FEDFUNDS': {'stat': 0.59507155418396, 'pvalue': 1.008704643589071e-15}}
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'stat': 0.08132576894776711, 'pvalue': 0.08981725287412488}, 'loan_rate_B': {'stat': 0.12791454984124828, 'pvalue': 0.0009999999999998899}, 'loan_rate_C': {'stat': 0.13865442725902158, 'pvalue': 0.0009999999999998899}, 'loan_rate_D': {'stat': 0.12708648400842287, 'pvalue': 0.0009999999999998899}, 'FEDFUNDS': {'stat': 0.4185270469008754, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_A': {'stat': 0.17590867451812156, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_B': {'stat': 0.18838563953847634, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_C': {'stat': 0.1810182591944418, 'pvalue': 0.0009999999999998899}, 'diff1_loan_rate_D': {'stat': 0.16004784597113098, 'pvalue': 0.0009999999999998899}, 'diff1_FEDFUNDS': {'stat': 0.3742488536123778, 'pvalue': 0.0009999999999998899}, 'diff2_FEDFUNDS': {'stat': 0.28686393808100397, 'pvalue': 0.0009999999999998899}}
Results for residuals_test_plan Test Plan:
Logged the following plots to the ValidMind platform:
Results for seasonality_test_plan Test Plan:
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': [{'Date': '2007-08-01', 'loan_rate_A': 7.7666666666666675, 'trend': nan, 'seasonal': -0.050284773390468364, 'resid': nan}, {'Date': '2007-09-01', 'loan_rate_A': 7.841428571428572, 'trend': nan, 'seasonal': -0.06087962072801926, 'resid': nan}, {'Date': '2007-10-01', 'loan_rate_A': 7.83, 'trend': nan, 'seasonal': 0.01749661199350169, 'resid': nan}, {'Date': '2007-11-01', 'loan_rate_A': 7.779090909090908, 'trend': nan, 'seasonal': -0.047258378330469565, 'resid': nan}, {'Date': '2007-12-01', 'loan_rate_A': 7.695833333333333, 'trend': nan, 'seasonal': 0.08505178146324885, 'resid': nan}, {'Date': '2008-01-01', 'loan_rate_A': 7.961333333333333, 'trend': nan, 'seasonal': 0.06564185816692848, 'resid': nan}, {'Date': '2008-02-01', 'loan_rate_A': 8.130333333333333, 'trend': 8.005048767959094, 'seasonal': 0.008943337297934253, 'resid': 0.11634122807630445}, {'Date': '2008-03-01', 'loan_rate_A': 8.126285714285714, 'trend': 8.036669799705125, 'seasonal': -0.002099404440702811, 'resid': 0.09171531902129207},...
Logged the following evaluation metric to the ValidMind platform:
{'loan_rate_A': {'acf_values': array([ 1. , 0.95235645, 0.89628237, 0.83365556, 0.78298876,
0.73759763, 0.68398844, 0.62657655, 0.56468994, 0.50331047,
0.42183716, 0.34023758, 0.25859584, 0.18191055, 0.11034449,
0.04315845, -0.0184955 , -0.0788836 , -0.12609771, -0.17004327,
-0.21018971, -0.24498837, -0.28174988, -0.31377348, -0.33500256,
-0.3343806 , -0.32206274, -0.30733837, -0.29602269, -0.28022519,
-0.26123446, -0.23725676, -0.22539932, -0.20575443, -0.18358424,
-0.16270934, -0.13889279, -0.11922772, -0.09506086, -0.06371048,
-0.03221894]), 'pacf_values': array([ 1.00000000e+00, 9.61340944e-01, -1.42725992e-01, -1.13803080e-01,
1.45601541e-01, 1.12916226e-02, -2.01201298e-01, -4.90737660e-02,
-4.91281155e-02, -7.22772152e-02, -4.00627990e-01, -1.93018777e-02,
-5.08939592e-03, -1.57441937e-01, -1.01953152e-01, 9.65840221e-02,
2.09312300e-02, -1.34464258e-01, 1.90609883e-01, 1.59998039e-01,
...